2025-02-05: Note that this has been altered to use the squamous cell lung cancer datasets from TCGA for TE and TM rather than the TCGA-LUAD from before

This script will include all steps in the “main pipeline” part of my thesis project. This includes differential analysis of the reference airway current vs never smoker dataset (A1, GSE63127), differential expression analysis of the TCGA-LUSC lung adenocarcinoma expression and methylation datasets, and the reference “persistent” airway current vs former vs never smoker dataset (A2, GSE7895). This includes all normalization, quality control, and filtering steps.

Loading libraries

library(EnhancedVolcano, verbose = FALSE)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(GEOquery, verbose = FALSE)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma, verbose = FALSE)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(umap, verbose = FALSE)
library(dplyr, verbose = FALSE)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1. Differential expression analysis of reference airway current vs never smoker dataset (A1, GSE63127)

1.1 Loading dataset

# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
#   Differential expression analysis with limma


# load series and platform data from GEO (date: 2024/10/15)
gset <- getGEO("GSE63127", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE63127_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples
gsms <- paste0("X00100011111X00000000000X00000X000000000000X000X00",
               "XX00X0XXXXXXXXX1111111111111111111111X111X11111111",
               "XXXX1XXXXXXXXXXXXXXXXXXXXXXXX001000000010100111111",
               "01110111110011111001110011101011111001110101100011",
               "111111111111111111111111111111")
sml <- strsplit(gsms, split="")[[1]]

# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
length(sml) # 182 samples
## [1] 182
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("Non-smoker","Smoker"))
levels(gs) <- groups
gset$group <- gs

1.2 Quality control checks and normalization

## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##

## Histogram ##
png("../2_Outputs/7_Figures/A1_histogram_prenorm.png", width = 2000, height = 1500, res = 300)
hist(as.matrix(exprs(gset)),
     main = "GSE63127 overall distribution of expression values",
     xlab = "RMA normalized probe intensity",
     ylab = "Frequency") # skewed left, needs log2 transform
dev.off()
## quartz_off_screen 
##                 2
# boxplot(exprs(gset),
#         ) # scary-looking

# It takes up lots of computation so commenting out for now
max(exprs(gset))
## [1] 136808
min(exprs(gset))
## [1] 0.0657913
# Should do log2 normalization

## log2 normalization ##
exprs(gset) <- log2(exprs(gset)+1)

# quantile normalization: no longer doing this for now to better capture variation
## exprs(gset) <- normalizeBetweenArrays(exprs(gset)) 

png("../2_Outputs/7_Figures/A1_histogram_postnorm.png", width = 2000, height = 1500, res = 300)
hist(as.matrix(exprs(gset)), 
     main = "GSE63127 distribution of expression values",
     xlab = "RMA and log2 normalized probe intensity",
     ylab = "Frequency") # much better
dev.off()
## quartz_off_screen 
##                 2
# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE63127", "/", annotation(gset), " value distribution", sep ="")
plotDensities(exprs(gset), group=gs, main=title, legend ="topright")

### Generate a boxplot ###

ex <- exprs(gset)
ord <- order(gs)  # order samples by group

# Set up PNG device with high resolution
png("../2_Outputs/7_Figures/A1_boxplot.png", width = 2000, height = 1500, res = 300)
palette(
  c("#7570B3", "#1B9E77")
  )
par(mar=c(7,4,2,1))
title <- "GSE63127 sample-wise distribution of expression values"
boxplot(ex[,ord], boxwex=0.6, notch=T,  outline=FALSE, las=2, col=gs[ord],
        main=title, 
        #xlab = "Samples", 
        ylab = "RMA and log2 normalized probe intensity",
        xaxt="n",
        whisklty = 0,
        #border = gs[ord]
        lwd = 0.8,
        space = -1
        )
# Add custom x-axis without sample labels
#axis(1, at=seq_along(colnames(ex)), labels=FALSE)  # Draw ticks without labels
# Add axis labels for clarification
mtext("Samples", side=1, line=2)  # Add "Samples" as the x-axis label
# Add legend
legend("bottomleft", inset=c(0, -0.3), fill=palette(), legend=groups, 
       bty="n", xpd=TRUE)  # Adjust inset and allow drawing outside plot area
# Close the PNG device
dev.off()
## quartz_off_screen 
##                 2
min(exprs(gset))
## [1] 0.09192496
max(exprs(gset))
## [1] 17.0618

1.3 Checking and correcting batch effect / sources of variation

1.3.1: Download and clean up the phenotypic information table from the dataset

phenotypic_data <- pData(gset)  # Extract phenotypic data
head(phenotypic_data, n = 3)
##                                   title geo_accession                status
## GSM190150 Small airways, non-smoker 029     GSM190150 Public on Dec 16 2008
## GSM190153 Small airways, non-smoker 036     GSM190153 Public on Jun 17 2008
## GSM254157     small airways, smoker 112     GSM254157 Public on Jun 17 2008
##           submission_date last_update_date type channel_count
## GSM190150     May 17 2007      Aug 28 2018  RNA             1
## GSM190153     May 17 2007      Aug 28 2018  RNA             1
## GSM254157     Jan 03 2008      Aug 28 2018  RNA             1
##                                                         source_name_ch1
## GSM190150 airway epithelial cells obtained by bronchoscopy and brushing
## GSM190153 airway epithelial cells obtained by bronchoscopy and brushing
## GSM254157 airway epithelial cells obtained by bronchoscopy and brushing
##           organism_ch1 characteristics_ch1 characteristics_ch1.1
## GSM190150 Homo sapiens             age: 34                sex: M
## GSM190153 Homo sapiens             age: 45                sex: F
## GSM254157 Homo sapiens             age: 45                sex: M
##            characteristics_ch1.2                 characteristics_ch1.3
## GSM190150    ethnic group: black            smoking status: non-smoker
## GSM190153 ethnic group: hispanic            smoking status: non-smoker
## GSM254157    ethnic group: white smoking status: smoker, 23 pack-years
##           molecule_ch1
## GSM190150    total RNA
## GSM190153    total RNA
## GSM254157    total RNA
##                                                                                                      extract_protocol_ch1
## GSM190150 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM190153 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM254157 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
##           label_ch1
## GSM190150    biotin
## GSM190153    biotin
## GSM254157    biotin
##                                                                                                                                                                  label_protocol_ch1
## GSM190150   Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM190153   Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM254157 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
##           taxid_ch1
## GSM190150      9606
## GSM190153      9606
## GSM254157      9606
##                                                                                                                                                                                  hyb_protocol
## GSM190150 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM190153 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM254157 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
##                                                        scan_protocol
## GSM190150 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM190153 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM254157 GeneChips were scanned using the GeneChip Scanner 3000 7G.
##                         description
## GSM190150 small airways, non-smoker
## GSM190153 small airways, non-smoker
## GSM254157 small airways, smoker 112
##                                                                                                                                                     data_processing
## GSM190150 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM190153 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM254157 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
##           platform_id           contact_name           contact_email
## GSM190150      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM190153      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM254157      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
##           contact_laboratory             contact_department
## GSM190150            Crystal Department of Genetic Medicine
## GSM190153            Crystal Department of Genetic Medicine
## GSM254157            Crystal Department of Genetic Medicine
##                       contact_institute  contact_address contact_city
## GSM190150 Weill Cornell Medical College 1300 York Avenue     New York
## GSM190153 Weill Cornell Medical College 1300 York Avenue     New York
## GSM254157 Weill Cornell Medical College 1300 York Avenue     New York
##           contact_state contact_zip/postal_code contact_country
## GSM190150            NY                   10021             USA
## GSM190153            NY                   10021             USA
## GSM254157            NY                   10021             USA
##                                                                          supplementary_file
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CEL.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CEL.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CEL.gz
##                                                                        supplementary_file.1
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CHP.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CHP.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CHP.gz
##           data_row_count                 relation               relation.1
## GSM190150          54675 Reanalyzed by: GSE119087                         
## GSM190153          54675  Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## GSM254157          54675  Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
##           age:ch1 cilia length:ch1 copd status:ch1
## GSM190150      34             <NA>            <NA>
## GSM190153      45             <NA>            <NA>
## GSM254157      45             <NA>            <NA>
##           department of genetic medicine id:ch1 ethnic group:ch1 ethnicity:ch1
## GSM190150                                  <NA>            black          <NA>
## GSM190153                                  <NA>         hispanic          <NA>
## GSM254157                                  <NA>            white          <NA>
##           serum 25-oh-d:ch1 sex:ch1    smoking status:ch1      group
## GSM190150              <NA>       M            non-smoker Non.smoker
## GSM190153              <NA>       F            non-smoker Non.smoker
## GSM254157              <NA>       M smoker, 23 pack-years     Smoker
# The phenotypic data is terrible.
# This is filtered down to the samples that were included.
# I will first try to clean up the phenotypic data.

# So I think the features I want to keep will be:
# Dates of submission/updates etc, sex, ethnicity, smoking status
# Keep the columns that might contain data of interest, which will need to be cleaned up.

# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("geo_accession","status","submission_date","last_update_date","characteristics_ch1","characteristics_ch1.1","characteristics_ch1.2","characteristics_ch1.3","age:ch1","cilia length:ch1","ethnic group:ch1","ethnicity:ch1","serum 25-oh-d:ch1","sex:ch1","smoking status:ch1","group")

# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)

phenotypic_data <- phenotypic_data[,c(indexes)]

# Now I need to parse out sex, ethnicity, smoking status, and age, vitamin D, pack years.

#Rename "group" as "smoking status"
names(phenotypic_data)[16] <- "smoking_status"

## Grabbing ethnicity values from the columns ##
# Initialize a new column "ethnicity" with NA values
phenotypic_data$ethnicity <- NA

# Function to find 'eth' in a row and return the corresponding value
find_ethnicity <- function(row) {
  eth_column <- which(grepl('eth', row))
  if (length(eth_column) > 0) {
    return(row[eth_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "ethnicity" column
phenotypic_data$ethnicity <- apply(phenotypic_data, 1, find_ethnicity)

## Grabbing sex values from the columns ##
# Initialize a new column "sex" with NA values
phenotypic_data$sex <- NA

# Function to find 'sex' in a row and return the corresponding value
find_sex <- function(row) {
  sex_column <- which(grepl('sex', row))
  if (length(sex_column) > 0) {
    return(row[sex_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "sex" column
phenotypic_data$sex <- apply(phenotypic_data, 1, find_sex)


## Grabbing pack_years values from the columns ##
# Initialize a new column "pack_years" with NA values
phenotypic_data$pack_years <- NA

# Function to find 'pack_years' in a row and return the corresponding value, but just the first instance
find_pack_years <- function(row) {
  pack_years_column <- which(grepl('pack', row))
  if (length(pack_years_column) > 0) {
    return(row[pack_years_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "pack_years" column
phenotypic_data$pack_years <- apply(phenotypic_data, 1, find_pack_years)
#unlist the column
phenotypic_data$pack_years <- unlist(phenotypic_data$pack_years )



## Grabbing age values from the columns ##
# Initialize a new column "age" with NA values
phenotypic_data$age <- NA

# Function to find 'age' in a row and return the corresponding value
find_age <- function(row) {
  age_column <- which(grepl('age', row))
  if (length(age_column) > 0) {
    return(row[age_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "age" column
phenotypic_data$age <- apply(phenotypic_data, 1, find_age)


## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA

# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
  vitamin_d_column <- which(grepl('vitamin', row))
  if (length(vitamin_d_column) > 0) {
    return(row[vitamin_d_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)

## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA

# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
  vitamin_d_column <- which(grepl('vitamin', row))
  if (length(vitamin_d_column) > 0) {
    return(row[vitamin_d_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)

## Grabbing cilia values from the columns ##
# Initialize a new column "cilia_length" with NA values
phenotypic_data$cilia_length <- NA

# Function to find 'cilia' in a row and return the corresponding value, first instance
find_cilia <- function(row) {
  cilia_column <- which(grepl('cilia', row))
  if (length(cilia_column) > 0) {
    return(row[cilia_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "cilia" column
phenotypic_data$cilia_length <- apply(phenotypic_data, 1, find_cilia)



## Now cut out the messy columns
phenotypic_data <- phenotypic_data[,-c(5:15)]

## Remove unnecessary prefix info
phenotypic_data$ethnicity <- gsub(".*: ", "", phenotypic_data$ethnicity )
phenotypic_data$age <- gsub(".*: ", "", phenotypic_data$age)
phenotypic_data$sex <- gsub(".*: ", "", phenotypic_data$sex)
phenotypic_data$vitamin_d <- gsub(".*: ", "", phenotypic_data$vitamin_d)
phenotypic_data$cilia_length <- gsub(".*: ", "", phenotypic_data$cilia_length)

phenotypic_data$pack_years<- gsub(".*, ", "", phenotypic_data$pack_years)
phenotypic_data$pack_years<- gsub("pack-years", "", phenotypic_data$pack_years)


# Reformat the submission dates to be sortable

phenotypic_data <- phenotypic_data %>%
  mutate(submission_date = ifelse(submission_date == "Dec 20 2012", "2012-12-20", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jan 03 2008", "2008-01-08", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jan 31 2013", "2013-01-31", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jun 03 2010", "2010-06-03", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jun 13 2008", "2008-06-13", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "May 17 2007", "2007-05-17", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Nov 08 2013", "2013-11-08", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Nov 10 2014", "2014-11-10", submission_date))

head(phenotypic_data, n = 3)
##           geo_accession                status submission_date last_update_date
## GSM190150     GSM190150 Public on Dec 16 2008      2007-05-17      Aug 28 2018
## GSM190153     GSM190153 Public on Jun 17 2008      2007-05-17      Aug 28 2018
## GSM254157     GSM254157 Public on Jun 17 2008      2008-01-08      Aug 28 2018
##           smoking_status ethnicity sex pack_years age vitamin_d cilia_length
## GSM190150     Non.smoker     black   M       <NA>  34      <NA>         <NA>
## GSM190153     Non.smoker  hispanic   F       <NA>  45      <NA>         <NA>
## GSM254157         Smoker     white   M        23   45      <NA>         <NA>

1.3.2: Plot PCA and use phenotypic information to look for sources of batch effect/variation, and correct for these with ComBat

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("Non-smoker","Smoker"))
levels(gs) <- groups
gset$group <- gs


## Plot PCA 1 ##
colz <- as.numeric(as.factor(gs)) # Get color values from group

plotMDS(exprs(gset),
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127",
        col = colz,
        pch = 1
)
legend("bottom", 
       legend = c("Smoker", "Non-smoker"), 
       col = c("#7570B3", "#1B9E77"), # Colors: only for smoking status
       pch = c(15, 15),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )                              # Box type: 'n' removes border

## We have 4 definite clusters that are not based on smoking status. 
## As such, it is a good idea to check the table of sample phenotypic information to look for sources of variation between samples.


pointz <- as.numeric(as.factor(phenotypic_data$submission_date<= "2010-06-03")) # Get point shape values from date of submission: split into 2010 and earlier, post-2010]

## Plot PCA with date information##
plotMDS(exprs(gset),
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz
        #labels = gset$group
)
legend("bottom", 
       legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1, 1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )                              # Box type: 'n' removes border

# Clearly the source of batch effect in PC1 is submission date post-2010.
# Note: I found that the split was at 2010 by doing a bit of playing around with other clustering methods, not shown here.

# First batch correction (submission date)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.3.3
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(limma)

# Making a batch vector
submission_post_2010_batch <- ifelse(phenotypic_data$submission_date < as.Date("2012-01-01"), 1, 2)

# Adjust the expression matrix for submission date batch effect
exprs_matrix_combat <- ComBat(dat=exprs(gset), batch=submission_post_2010_batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Plot PCA for expression values after first batch correction ##
date_corrected_PCA <- plotMDS(exprs_matrix_combat,
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127, corrected for submission date",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz

)
legend("bottom",
legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1)
       #title = "Smoking status"
       )

## Some evidence that second source of variation could be due to sex (but only 11/182 samples have sex labels):
plotMDS(exprs_matrix_combat,
        gene.selection = "common",
       # main = "PCA for CS vs NS GSE63127, corrected for submission date",
        col = colz, # Colors smokers red and nonsmokers black
        #pch = pointz2 # Using separate shapes for all submission dates
        labels = phenotypic_data$sex
)
legend("bottom",
       legend = c("Smoker", "Non-smoker", "M = Male", "F = Female"),
       col = c("#7570B3", "#1B9E77", "black", "black"),
       pch = c(15, 15, NA, NA)
       #title = "Smoking status"
       )

## Samples are divided by sex, but 11/182 samples is not enough to draw a conclusion here.

## Second correction for unknown source of variation using ComBat: ##

# Assign batch labels based on the first dimension from MDS (equivalent to PC1), since the dividing line for the batches lies at 0
unknown_batch_labels <- ifelse(date_corrected_PCA$x < 0, 1, 2)

# Do a second batch correction
exprs_matrix_combat_2 <- ComBat(dat=exprs_matrix_combat, batch=unknown_batch_labels, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# View PCA plot
plotMDS(exprs_matrix_combat_2,
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127 after second correction",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz
        #labels = gset$group
)
legend("topleft", 
       legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1, 1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )  

## Now PC1 corresponds quite well to smoking status after the two ComBat corrections.


### 2024/12/13 NOTE: Maybe I should be using SVA as a correction for the unknown source of variation rather than ComBat since this is not really a batch effect?

1.4 Differential expression analysis (limma with vooma)

# Finish setting up the design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)

## Crucial bit: Replace the expression values in gset with the batch corrected ones ##
exprs(gset) <- as.matrix(exprs_matrix_combat_2)

# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations

# fit linear model
fit  <- lmFit(v)

# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)

# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)

tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))

1.5 Basic filtering of DEGs (unlabelled, duplicates)

GSE63127_CS_NS_limma <- tT %>%
  filter(Gene.symbol != "") %>% # Remove blank gene symbols
  group_by(Gene.symbol) %>%
  slice_min(adj.P.Val, with_ties = TRUE) %>% 
  # For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
  ungroup()
head(GSE63127_CS_NS_limma)
## # A tibble: 6 × 4
##   ID          Gene.symbol  logFC adj.P.Val
##   <chr>       <chr>        <dbl>     <dbl>
## 1 229819_at   A1BG        -0.106    0.481 
## 2 232462_s_at A1BG-AS1     0.531    0.0224
## 3 220951_s_at A1CF         0.302    0.123 
## 4 1558450_at  A2M          0.110    0.453 
## 5 1564139_at  A2M-AS1     -0.138    0.0724
## 6 1553505_at  A2ML1        0.145    0.636
# Checking for ties
ties <- GSE63127_CS_NS_limma %>%
  group_by(Gene.symbol) %>%
  filter(n() > 1) %>%
  ungroup()
print(ties) # No ties
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
nrow(GSE63127_CS_NS_limma)
## [1] 22189

1.8 Visualization of DEGs (volcano plot)

FDR_cutoff_A1 <- 0.01
log2FC_cutoff_A1 <- 0.25

v_A1 <- EnhancedVolcano::EnhancedVolcano(
  toptable = GSE63127_CS_NS_limma,
  lab = GSE63127_CS_NS_limma$Gene.symbol,
  x = "logFC", # "mean difference" is estimate here
  y = "adj.P.Val", 
 # pCutoffCol = 'min_smoothed_fdr',
  xlab = "log2FC",
  ylab = "-log10(FDR)",
  title = "A1 DEGs",
  subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_A1, "    FDR cutoff: ", FDR_cutoff_A1),
  caption = paste0("Total = ", nrow(GSE63127_CS_NS_limma[abs(GSE63127_CS_NS_limma$logFC)>log2FC_cutoff_A1 &  GSE63127_CS_NS_limma$adj.P.Val < FDR_cutoff_A1,]), " significant DEGs meeting cutoffs"),
  col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
  legendPosition = "bottom",
  labSize = 3,
  max.overlaps = 3,
  drawConnectors = TRUE,
  widthConnectors = 0.3,
  arrowheads = FALSE,
  pCutoff = FDR_cutoff_A1,
  FCcutoff = log2FC_cutoff_A1,
  gridlines.minor = FALSE,
  gridlines.major = FALSE,
  xlim = c(-3, 6)
)

v_A1
## Warning: ggrepel: 2619 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

1.7 Save outputs

# Change date suffix as appropriate if changes are made
#write.table(GSE63127_CS_NS_limma, "../2_Outputs/1_Airway_DEGs/GSE63127_CS_NS_limma_20241204.txt", sep = '\t')

1.8 Extra checks

2024/11/07: Comparing the DEGs list when quantile normalization is used vs not used:

GSE63127_CS_NS_GEO2R_limma_sig_quantile <- read.table("../2_Outputs/GSE63127_CS_NS_GEO2R_limma_sig_20241016.txt", header = TRUE)
GSE63127_CS_NS_GEO2R_limma_sig_no_quantile <- GSE63127_CS_NS_limma[GSE63127_CS_NS_limma$adj.P.Val<0.05,]

# Compare results
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
## 
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
## 
##     scat
venn <- venn.diagram(
  list(
    DEGs_no_quantile = GSE63127_CS_NS_GEO2R_limma_sig_no_quantile$Gene.symbol,
    DEGs_quantile = GSE63127_CS_NS_GEO2R_limma_sig_quantile$Gene.symbol
  ),
  filename = NULL
)
# Display the diagram
grid.newpage()
grid.draw(venn)

The lists agree quite well. The list without quantile normalization is larger. Quantile normalization could be over-normalization and mask some variation. The PCA plots still look good without quantile normalization. I will elect to go forward without quantile normalization. I will have to apply the same to all the other airway datasets for the meta-analysis bit (i.e. do not do quantile normalization for them).


2. Differential expression analysis of TCGA-LUAD tumor vs normal tissue (T-E)

2.1.1.1 Loading dataset

library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)

query <- GDCquery(project = "TCGA-LUSC",
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification",
                  sample.type = c("Primary Tumor", "Solid Tissue Normal"),
                  workflow.type = "STAR - Counts")

GDCdownload(query)

data <- GDCprepare(query)

2.1.1.2 Download clinical data

query_clinical <- GDCquery(
  project = "TCGA-LUSC",
  data.category = "Clinical",
  data.type = "Clinical Supplement"
)

# Check the available files
query_clinical$results[[1]] %>% head()

## I see that some are not BCR XML files, so I will try to remove these
query_clinical$results[[1]] <- query_clinical$results[[1]] %>%
  filter(data_format == "BCR XML")

## Download data
GDCdownload(query_clinical)
clinical_data <- GDCprepare_clinic(query_clinical, clinical.info = "patient")

head(clinical_data)

2.1.2 Formatting as tumor-normal pairs, removing never-smoker samples

counts <- as.data.frame(assay(data))  # Extracting the count matrix (these are supposedly raw counts)
#head(counts)  # Viewing the first few rows (genes) and columns (samples)

gene_info <- as.data.frame(rowData(data))
head(gene_info)  # Preview the first few genes and their annotations
##                    source type score phase            gene_id      gene_type
## ENSG00000000003.15 HAVANA gene    NA    NA ENSG00000000003.15 protein_coding
## ENSG00000000005.6  HAVANA gene    NA    NA  ENSG00000000005.6 protein_coding
## ENSG00000000419.13 HAVANA gene    NA    NA ENSG00000000419.13 protein_coding
## ENSG00000000457.14 HAVANA gene    NA    NA ENSG00000000457.14 protein_coding
## ENSG00000000460.17 HAVANA gene    NA    NA ENSG00000000460.17 protein_coding
## ENSG00000000938.13 HAVANA gene    NA    NA ENSG00000000938.13 protein_coding
##                    gene_name level    hgnc_id          havana_gene
## ENSG00000000003.15    TSPAN6     2 HGNC:11858 OTTHUMG00000022002.2
## ENSG00000000005.6       TNMD     2 HGNC:17757 OTTHUMG00000022001.2
## ENSG00000000419.13      DPM1     2  HGNC:3005 OTTHUMG00000032742.2
## ENSG00000000457.14     SCYL3     2 HGNC:19285 OTTHUMG00000035941.6
## ENSG00000000460.17  C1orf112     2 HGNC:25565 OTTHUMG00000035821.9
## ENSG00000000938.13       FGR     2  HGNC:3697 OTTHUMG00000003516.3
sample_info <- as.data.frame(colData(data))
#head(sample_info)  # Preview sample metadata

table(sample_info$sample_type)  # Summarize sample types (Tumor vs. Normal)
## 
##       Primary Tumor Solid Tissue Normal 
##                 511                  51
# Extract just the normal sample info
sample_info_normal <- sample_info[sample_info$definition=="Solid Tissue Normal",] %>%
    filter (tissue_or_organ_of_origin != "Blood")# Remove the weird leukemia blood sample.

# Look for tumor samples with normal matches from same patients
sample_info_tumor <- sample_info %>%
  filter(patient %in% sample_info_normal$patient) %>%
  filter(definition == "Primary solid Tumor") %>%
  filter (tissue_or_organ_of_origin != "Blood")# Remove the weird leukemia blood sample.


# Make the matched tumor-normal sample table
sample_info_matched_T_NM <- rbind(sample_info_tumor, sample_info_normal)[order(c(seq_len(nrow(sample_info_tumor)), seq_len(nrow(sample_info_normal)))), ]

sample_info_matched_T_NM <- sample_info_matched_T_NM %>% 
  dplyr::select(-treatments) %>% # Removing treatments column since it is in the form of a list and has no info
  arrange(., sample_type_id) %>% # First sort by tumor vs normal
  arrange(., patient) # arrange by patient to get the tumor normal pairs


### Remove never smoker samples from the sample info table ###

# A tobacco history label of 1 corresponds to never smokers: https://pmc.ncbi.nlm.nih.gov/articles/PMC7427766/

# Merge the clinical sample table with tobacco smoking history from clinical table and remove never smokers
sample_info_matched_T_NM <- sample_info_matched_T_NM %>%
  left_join(., clinical_data %>% select( "bcr_patient_barcode", "tobacco_smoking_history"), by = c("patient"= "bcr_patient_barcode")) 

#There are no never-smokers in the cohort (none have smoking status of 1)


### Modify the counts table for tumor-normal matched data ###

# Keep the counts columns of sample labels that are in the T-NM matched info
sample_barcodes <- as.character(sample_info_matched_T_NM$barcode)
counts_matched_T_NM <- counts %>%
  dplyr::select(all_of(sample_barcodes))

# Rename with sample label instead of sample barcode
names(counts_matched_T_NM) <- sample_info_matched_T_NM$sample

2.2.1 Quality control checks

library(dplyr)
library(edgeR)
## Warning: package 'edgeR' was built under R version 4.3.2
# Checking distribution of the whole counts table
hist(as.matrix(counts_matched_T_NM)) # whoa

hist(log2(as.matrix(counts_matched_T_NM))) # Still not normal at all

# Checking distribution of just tumor samples
# counts_matched_T <- counts_matched_T_NM %>%
#   dplyr::select(seq(1, ncol(counts_matched_T_NM), by = 2))
# hist(log2(as.matrix(counts_matched_T))) # Equally bad distribution, why is it the same though??
# 
# # Checking distribution of just normal samples
# counts_matched_NM <- counts_matched_T_NM %>%
#   dplyr::select(seq(2, ncol(counts_matched_T_NM), by = 2))
# hist(log2(as.matrix(counts_matched_NM))) # Equally bad distribution, why is it the same though????


#boxplot(counts_matched_T_NM) # Commenting out due to computation time

# Boxplots for all counts looks crazy, but apparently this is common in RNAseq data and is accounted for by DESeq2 by a size faxctor approach (can back up later with a source)


## PCA to check for tumor-normal separation
#colz <- as.numeric(as.factor(rep(c(1,0), length(counts_matched_T_NM)/2))) # Get color values from group
colz <- rep(c("firebrick2","black"), length(counts_matched_T_NM)/2 ) # Get color values from group

plotMDS(counts_matched_T_NM,
        gene.selection = "common",
        main = "PCA for TCGA-LUSC expression",
        col = colz,
        pch = 1
)
legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("firebrick2", "black"), 
       pch = c(15, 15),                
       text.col = "black"
       )  

# Separate but not very good separation, 1 definite outlier.
# To find the outlier, plotting PCA with sample names
plotMDS(counts_matched_T_NM,
        gene.selection = "common",
        main = "PCA for TCGA-LUSC expression",
         col = colz  
        #pch = 1
)
legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("firebrick2", "black"), # Colors: only for smoking status
       pch = c(15, 15),                
       text.col = "black"
       )  

##  Making a dendrogram to see if the same outlier is found
sample_dist <- dist(t(counts_matched_T_NM))  # Transpose the matrix to calculate distances between samples
hc <- hclust(sample_dist) #Perform hierarchical clustering
plot(hc, main = "Dendrogram of Samples", xlab = "", sub = "", cex = 0.8) # Plot the dendrogram

2.2.2 Acting on quality control checks: Remove outlier and its pair

## It appears that an outlier candidate is TCGA-56-8623-01A
## There may be others, I also don't see this on a blacklist when I search for it, but I will elect to remove it and its pair


counts_matched_T_NM <- counts_matched_T_NM %>% dplyr::select(-c("TCGA-56-8623-01A","TCGA-56-8623-11A"))

## PCA to check for tumor-normal separation with outlier removed
colz2 <- as.numeric(as.factor(rep(c(0,1), length(counts_matched_T_NM)/2))) # Get color values from group
plotMDS(counts_matched_T_NM,
        gene.selection = "common",
        main = "PCA for TCGA-LUSC expression after outlier removal",
        col = colz2,
        pch = 1
)
legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = colz2, # Colors: only for smoking status
       pch = c(15, 15),                
       text.col = "black"
       )  

## Saving this version of the T-NM matched counts
#write.table(counts_matched_T_NM, "../2_Outputs/3_Tumor_expression/TCGA_LUAD_counts_matched_T_NM_20241125.txt")

The matrices have messy boxplots and histograms, but since I am using the signed-rank test, it does not suppose require normally distributed data, so I have decided to go with this raw counts matrix for now.

2024/12/04 In an earlier version of the script I used a Wilcoxon signed-rank test since I read in a paper that a Wilcoxon-based method was the most accurate method (Li et al. Genome Biology (2022) 23:79)

I wrote some code based on below: Source code: https://github.com/xihuimeijing/DEGs_Analysis_FDR/blob/main/scripts/DEGs.R Accessed 2023/08/26

Tutorial: https://rpubs.com/LiYumei/806213 Accessed 2023/08/31

Unlike the tutorial, here I performed a signed-rank test rather than a rank-sum test, as the samples are not independent (they are matched tumor and normal samples).

However, I later decided to switch to DESeq2 since this is the preferred, standard method for large sample sizes in RNAseq (find a good source to back it up).

It seems that the Wilcoxon signed-rank test may be much more suitable for assessing a handful of genes rather than whole-transcriptome analysis. DESeq2 is more typically used for the latter, despite the finding of the publication listed above. I will compare the results using DESeq2.

Below I show the limma version.

Note that I include a filtering step to remove genes with low counts: “Filter out rows with less than 10 total counts in the smallest sample group size” The smallest group (tumor or normal) has 44 members, so this filters out any genes that cannot reach a sum of 10 counts using 44 samples.

Differential expression analysis using limma for consistency

# Create a design matrix
groups <- rep(c("Tumor","Normal"), length(counts_matched_T_NM)/2)
design <- model.matrix(~0 + groups)

# "Condition" should indicate Tumor/Normal status
colnames(design) <- c("Normal", "Tumor")

# Filter out rows with less than 10 total counts in the smallest sample group size (88/2 = 44)
keep <- rowSums(counts_matched_T_NM >= 10) >= 44
counts_matched_T_NM <- counts_matched_T_NM[keep,]

# Use voom to transform the counts
voom_data <- voom(counts_matched_T_NM, design)

# Fit the model
fit <- lmFit(voom_data, design)

# Define contrasts (Tumor vs Normal)
contrast_matrix <- makeContrasts(Tumor - Normal, levels = design) 
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)

# Get top differentially methylated probes
tT <- topTable(fit2, adjust = "BH", number = Inf)

# Giving more descriptive name and colnames to list
TCGA_LUSC_DEG <- tT
TCGA_LUSC_DEG$Gene <- rownames(TCGA_LUSC_DEG)
rownames(TCGA_LUSC_DEG) <- NULL
TCGA_LUSC_DEG <- TCGA_LUSC_DEG %>%
  dplyr::select(., Gene, logFC, adj.P.Val) %>%
  dplyr::rename(., log2FC = logFC, FDR = adj.P.Val)

# Giving a yet more descriptive name
TCGA_LUSC_limma_DEG <- TCGA_LUSC_DEG


### Replace ensembl IDs with gene names
TCGA_LUSC_limma_DEG <- TCGA_LUSC_limma_DEG %>% arrange(., rownames(.))
gene_info_sorted <- gene_info %>% 
  arrange(., gene_id) %>%
  filter(gene_id %in% TCGA_LUSC_limma_DEG$Gene)

TCGA_LUSC_limma_DEG$Gene <- gene_info_sorted$gene_name

# Save output
write.table(TCGA_LUSC_limma_DEG, "../2_Outputs/4_Tumor_DEGs/TCGA_LUSC_limma_DEG_20250130.txt", sep = '\t')

Visualization of the DEGs

log2FC_cutoff_TE <- 0.75
FDR_cutoff_TE <- 0.001

v_TE <- EnhancedVolcano::EnhancedVolcano(
  toptable = TCGA_LUSC_limma_DEG,
  lab = TCGA_LUSC_limma_DEG$Gene,
  x = "log2FC",
  y = "FDR", 
 # pCutoffCol = 'min_smoothed_fdr',
  xlab = "log2FC",
  ylab = "-log10(FDR)",
  title = "TE DEGs",
  subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_TE, "   FDR cutoff: ", FDR_cutoff_TE),
  caption = paste0("Total = ", nrow(TCGA_LUSC_limma_DEG[abs(TCGA_LUSC_limma_DEG$log2FC)>log2FC_cutoff_TE & TCGA_LUSC_limma_DEG$FDR < FDR_cutoff_TE,]), " significant DEGs meeting cutoffs"),
  col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
  legendPosition = "bottom",
  labSize = 3,
  max.overlaps = 10,
  drawConnectors = TRUE,
  widthConnectors = 0.3,
  arrowheads = FALSE,
  pCutoff = FDR_cutoff_TE,
  FCcutoff = log2FC_cutoff_TE,
  gridlines.minor = FALSE,
  gridlines.major = FALSE,
  xlim = c(-3, 5),
  ylim = c(0,10)
)

v_TE
## Warning: ggrepel: 3580 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps


3. Differential methylation analysis of TCGA-LUAD tumor vs normal tissue (T-M)

I downloaded this level 3 methylation 450k data from cBioPortal, from TCGA Lung Adenocarcinoma (Firehose Legacy) https://www.cbioportal.org/study/summary?id=luad_tcga (Accessed 2023/08/29)

Note that this provides gene information but not probe information. According to the metadata, “Methylation (HM450) beta-values for genes in 492 cases. For genes with multiple methylation probes, the probe most anti-correlated with expression.”

2024/12/04:

I did a lot of work trying to do the analysis starting from probe level information, but ultimately decided to stick with the cbioportal pre-processed table for now. I may want to revisit the probewise analysis later once I am more confident in the output.

I decided I should switch from wilcoxon sign-rank test to limma because this is more typically used for large and complex datasets like TCGA with differential methylation analysis (look for source).

3.1 Loading dataset downloaded from cbioportal

data_methylation_hm450_tumor <- read.table("../4_TCGA_data/TCGA_LUSC/data_methylation_hm450_LUSC.txt", header=TRUE, fill=TRUE)


data_methylation_hm450_normal <- read.table("../4_TCGA_data/TCGA_LUSC/data_methylation_hm450_normals_LUSC.txt", header=TRUE, fill=TRUE)

3.2 Formatting dataset

3.2.1 Formatting counts in tumor-normal pairs

allIDs_tumor  <- colnames(data_methylation_hm450_tumor)
allIDs_normal <- colnames(data_methylation_hm450_normal)


## 2025/01/29: Remove never smokers if there are any

## Remove suffix from normal IDs and replace . with -
allIDs_normal_patients <- allIDs_normal %>%
  gsub("*.11","", .) %>%
  gsub("\\.", "-", .)

## Filter the clinical info to those patient IDs without never smokers
clinical_data_filtered <- clinical_data %>%
  filter(bcr_patient_barcode %in% allIDs_normal_patients) %>%
  filter(tobacco_smoking_history != 1) # Remove never smokers

## Filter out the corresponding IDs from allIDs_normal
allIDs_normal_filtered <- clinical_data_filtered$bcr_patient_barcode%>%
  gsub("-","\\.", .) %>%
  lapply(., function(x) paste0(x, ".11")) %>%
  unlist()
length(allIDs_normal_filtered)
## [1] 40
#Listing IDs of tumors that have matched normals by changing the tissue ID to the "tumor" identifier, "01", for matching purposes.
IDs_tumor_with_matches <-gsub("*.11",".01", allIDs_normal_filtered)
length(IDs_tumor_with_matches)
## [1] 40
#Make a table of the methylation data for tumor samples only with matching normal data.
data_methylation_hm450_tumor_with_matches <- data_methylation_hm450_tumor %>%
  dplyr::select(any_of(IDs_tumor_with_matches))
length(data_methylation_hm450_tumor_with_matches)
## [1] 38
# Somehow I am losing two samples here. Check which samples those are
missing_tumor_samples <- IDs_tumor_with_matches[!IDs_tumor_with_matches %in% colnames(data_methylation_hm450_tumor_with_matches)]
missing_tumor_samples
## [1] "TCGA.43.3394.01" "TCGA.43.3920.01"
# OK, I will remove the corresponding normal IDs from the normals


#Make a table of the methylation data for normal samples only with matching tumor data.
data_methylation_hm450_normal_with_matches <- data_methylation_hm450_normal %>%
  dplyr::select(allIDs_normal_filtered) %>%
  dplyr::select(! c("TCGA.43.3394.11", "TCGA.43.3920.11")) # Remove the samples with no matching tumor sample
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(allIDs_normal_filtered)
## 
##   # Now:
##   data %>% select(all_of(allIDs_normal_filtered))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
length(data_methylation_hm450_normal_with_matches)
## [1] 38
# There are still 2 more normals than tumors. Check what doesn't match here.


#Make a combined table of matched tumor-normal samples.
data_methylation_hm450_tumor_normal_matched <- cbind(data_methylation_hm450_tumor_with_matches, data_methylation_hm450_normal_with_matches)[order(c(1:length(data_methylation_hm450_tumor_with_matches),1:length(data_methylation_hm450_normal_with_matches)))]

# Adding back gene labels
data_methylation_hm450_tumor_normal_matched <- cbind(Gene = data_methylation_hm450_tumor$Hugo_Symbol, data_methylation_hm450_tumor_normal_matched) 

3.2.2 Handling duplicate genes

library(dplyr)

# Step 1: Identify rows with duplicate Gene values
duplicates <- data_methylation_hm450_tumor_normal_matched %>%
  group_by(Gene) %>%
  filter(n() > 1)

# Step 2: Remove the row with the smallest row sum for each duplicate gene
rows_to_remove <- duplicates %>%
  rowwise() %>%
  mutate(row_sum = sum(c_across(where(is.numeric)))) %>%  # Compute row sum for numeric columns
  group_by(Gene) %>%
  slice_min(row_sum, with_ties = FALSE) %>%  # Select the row with the smallest sum
  ungroup()

# Step 3: Remove these rows from the original dataframe
data_methylation_hm450_tumor_normal_matched <- anti_join(data_methylation_hm450_tumor_normal_matched, rows_to_remove, by = colnames(data_methylation_hm450_tumor_normal_matched))

# Now make the gene names into row names
rownames(data_methylation_hm450_tumor_normal_matched) <- data_methylation_hm450_tumor_normal_matched$Gene
data_methylation_hm450_tumor_normal_matched$Gene <- NULL

3.3 Quality control checks and conversion to M values for differential analysis

hist(as.matrix(data_methylation_hm450_tumor_normal_matched), breaks = 75)

boxplot(data_methylation_hm450_tumor_normal_matched)

max(!is.na(data_methylation_hm450_tumor[3:length(data_methylation_hm450_tumor_normal_matched)]))
## [1] 1
min(!is.na(data_methylation_hm450_tumor[3:length(data_methylation_hm450_tumor_normal_matched)]))
## [1] 0
# Beta values normally have a bimodal distribution so it's not really unusual to see this

## Conversion from beta to M values ##

# Shorter name for convenience
methyl_beta <- data_methylation_hm450_tumor_normal_matched
# Convert to M values
methyl_M=log2(data_methylation_hm450_tumor_normal_matched/(1-data_methylation_hm450_tumor_normal_matched))

## Plot a histograk of the M values
hist(as.matrix(methyl_M),
     main = "Distribution of M-values in TCGA-LUSC methylation",
     xlab = "M-value",
     breaks = 75) ## Closer to normal distribution though still definitely not a perfect one

## Plot the expression distribution of the individual samples
title <- "TCGA-LUSC methylation value distribution"
colz <- rep(c("Tumor","Normal"), length(methyl_M)/2)
plotDensities(as.matrix(methyl_M), group=colz, main=title, legend ="topright", col = c("darkolivegreen3","brown2"))

### PCA checks ###

## PCA to check for tumor-normal separation with outlier removed
colz <- rep(c("red","black"), length(methyl_M)/2) # Get color values from T or NM group
plotMDS(methyl_M,
        gene.selection = "common",
        main = "PCA for TCGA-LUSC methylation data",
        col = colz,
        pch = 1
)

legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("red", "black"),
       pch = 15
       )  

## Very good tumor-normal separation

3.4 Differential methylation analysis using limma

# Create a design matrix
groups <- rep(c("Tumor","Normal"), length(methyl_M)/2)
design <- model.matrix(~0 + groups)

# "Condition" should indicate Tumor/Normal status
colnames(design) <- c("Normal", "Tumor")

# Fit the model
fit <- lmFit(methyl_M, design)

# Define contrasts (Tumor vs Normal)
contrast_matrix <- makeContrasts(Tumor - Normal, levels = design) 
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)

# Get top differentially methylated probes
tT <- topTable(fit2, adjust = "BH", number = Inf)

# Giving more descriptive name and colnames to list
TCGA_LUSC_DMG <- tT
TCGA_LUSC_DMG$Gene <- rownames(TCGA_LUSC_DMG)
rownames(TCGA_LUSC_DMG) <- NULL
TCGA_LUSC_DMG <- TCGA_LUSC_DMG %>%
  dplyr::select(., Gene, logFC, adj.P.Val) %>%
  dplyr::rename(., log2FC = logFC, FDR = adj.P.Val)

# Giving a yet more descriptive name
TCGA_LUSC_limma_DMG <- TCGA_LUSC_DMG

nrow(TCGA_LUSC_limma_DMG)
## [1] 16706
### Saving output
#write.table(TCGA_LUAD_limma_DMG, "../2_Outputs/5_Tumor_DMGs/TCGA_LUSC_limma_DMG_20250131.txt", sep = '\t')

3.5 Visualization of DMGs (volcano plot)

log2FC_cutoff_TM <- 0.25
FDR_cutoff_TM <- 0.01

v_TM <- EnhancedVolcano::EnhancedVolcano(
  toptable = TCGA_LUSC_limma_DMG,
  lab = TCGA_LUSC_limma_DMG$Gene,
  x = "log2FC",
  y = "FDR", 
 # pCutoffCol = 'min_smoothed_fdr',
  xlab = "log2FC",
  ylab = "-log10(FDR)",
  title = "TM DMGs",
  subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_TM, "   FDR cutoff: ", FDR_cutoff_TM),
  caption = paste0("Total = ", nrow(TCGA_LUSC_limma_DMG[abs(TCGA_LUSC_limma_DMG$log2FC)>log2FC_cutoff_TM & TCGA_LUSC_limma_DMG$FDR < FDR_cutoff_TM,]), " significant DMGs meeting cutoffs"),
  col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
  legendPosition = "bottom",
  labSize = 3,
  max.overlaps = 10,
  drawConnectors = TRUE,
  widthConnectors = 0.3,
  arrowheads = FALSE,
  pCutoff = FDR_cutoff_TE,
  FCcutoff = log2FC_cutoff_TE,
  gridlines.minor = FALSE,
  gridlines.major = FALSE,
  xlim = c(-3, 3),
  ylim = c(0,10)
)

v_TM
## Warning: ggrepel: 2515 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps


4. Differential expression analysis of reference reference “persistent” airway current vs former vs never smoker dataset (A2)

4.1 Loading dataset

# load series and platform data from GEO

gset <- getGEO("GSE7895", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE7895_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples
gsms <- paste0("22222222222222222222200000000000000000000000000000",
               "00000000000000000000000111111111111111111111111111",
               "1111")
sml <- strsplit(gsms, split="")[[1]]

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("current_smoker","former_smoker","never_smoker"))
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values

4.2 Quality control checks and normalization

2024/12/04: My real reason for using quantile normalization is that I obtained more DEGs when I did it this way. But to give a better justification I could point out that the boxplots look quite variable and it seemed appropriate to normalize the expression distributions.

4.2.1 Initial checks (histogram, boxplot, PCA), and quantile normalization

## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##

hist(as.matrix(exprs(gset))) # Values range 1-15, and 1 big peak around 3. 

boxplot(exprs(gset)) # Same range of values, with similar-looking ranges, but not exactly the same

# Narrow range, therefore no log2 normalization needed

## Going to try this again without the quantile to see what happens

# exprs(gset) <- normalizeBetweenArrays(exprs(gset))
# boxplot(exprs(gset))
# 2024/11/12: I elected to do quantile normalization because this gave me a larger list of "persistent" genes. Could justify that it "better captures the variation between groups" etc

min(exprs(gset))
## [1] -0.3312336
max(exprs(gset))
## [1] 15.29594
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for GSE7895",
        col = colz,
        pch = 1
        #labels = gs
        )

legend("topright", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

# No separation, all mixed up. This isn't a good look.

4.2.2 Investigating sources of variation that could account for lack of separation of samples by smoking status

Extract and format phenotypic data

library(stringr)

phenotypic_data <- pData(gset)  # Extract phenotypic data

# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("characteristics_ch1.1", "group")

# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)

phenotypic_data <- phenotypic_data[,c(indexes)]

# Extract Age
phenotypic_data$age <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Age:)\\d+"))

# Extract Packyears
phenotypic_data$packyears <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Packyears:)\\d+"))

# Extract Time Since Quit Smoking (months)
phenotypic_data$TSQ_months <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Time Since Quit Smoking \\(months\\):)\\d+"))

# Delete the original column with the unseparated info
phenotypic_data <- phenotypic_data[,-1]

# Convert the NA values for packyears for never smokers to zero (this makes sense since the never smokers have 0 pack years)
phenotypic_data$packyears[phenotypic_data$group=="never_smoker"] <- 0

# Convert the NA values for TSQ_months to zero for current smokers (again makes sense)
phenotypic_data$TSQ_months[phenotypic_data$group=="current_smoker"] <- 0

# Make column to denote just former smoking status for the linear model
phenotypic_data$former_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "former_smoker"))

# Make column to denote just current smoking status for the linear model
phenotypic_data$current_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "current_smoker"))

# Make column to denote just never smoking status for the linear model
phenotypic_data$never_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "never_smoker"))

4.2.3 Plot PCA using other phenotypic data

## Plot PCA using age to define color
# Create a gradient color palette (light blue to dark blue)
palette <- colorRampPalette(c("lightblue", "darkblue"))

## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)]  # Map ages to gradient colors
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher age)",
        col = colz_age,
        pch = 1
        )
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age), 
       fill = palette(2), 
       title = "Age")

# Does not seem to be an age effect




### Plot PCA of packyears ###

# Excluding packyears of zero (never smokers)
pheno_packyears <- phenotypic_data[phenotypic_data$packyears!=0,]
exprs_packyears <- as.data.frame(exprs(gset)) %>%
  dplyr::select(rownames(pheno_packyears))

colz_packyears <-  palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_packyears,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher packyears)",
        col = colz_packyears,
        pch = 1
        #labels = gs
        )
# Add a color bar for packyears
legend("topright", legend = range(pheno_packyears$packyears), 
       fill = palette(2), 
       title = "Packyears")

## Does not seem to be packyears effect


### Plot PCA of time since quitting ###
pheno_tsq <- phenotypic_data[!is.na(phenotypic_data$TSQ_months),]
exprs_tsq <- as.data.frame(exprs(gset)) %>%
  dplyr::select(rownames(pheno_tsq))

colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)]  # Map packyears to gradient colors
plotMDS(exprs_tsq,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ more time since quitting)",
        col = colz_TSQ,
        pch = 1
        #labels = gs
        )
legend("topright", legend = range(pheno_tsq$TSQ), 
       fill = palette(2), 
       title = "TSQ")

## Does not seem to be TSQ effect

This lack of separation by smoking status and inability to account for the variation by other phenotypic variables could potentially be problematic, but I propose/hypothesize that if the genes determined to be “persistent” can differentiate between the groups as expected in PCA, it will be evidence that the results are valid despite the groups not being differentiated by all the genes taken as a whole.

Note that I began trying to do this analysis accounting for pack years and TSQ (see other script), but for now I am just looking at the smoking status comparisons alone.

4.3 Differential expression analysis

v <- vooma(gset, design, plot=T)

v$genes <- fData(gset) # attach gene annotations


# fit linear model
fit  <- lmFit(v)

# set up contrasts of interest and recalculate model coefficients
#cts <- c(paste(groups[1],"-",groups[2],sep=""), paste(groups[1],"-",groups[3],sep=""), paste(groups[2],"-",groups[3],sep=""))
#cont.matrix <- makeContrasts(contrasts=cts, levels=design)
cont.matrix <- makeContrasts(
  CS_vs_NS = current_smoker - never_smoker,
  FS_vs_NS = former_smoker - never_smoker,
  CS_vs_FS = current_smoker - former_smoker,
  levels = design
)

fit2 <- contrasts.fit(fit, cont.matrix)


# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, proportion = 0.01) # Proportion is "assumed proportion of genes which are differentially expressed"

4.4 Select “persistent” DEGs, and basic filter (keep lower FDR of duplicates, apply FDR < 0.05 cutoff)

library(dplyr)
library(VennDiagram)

## Separate out genes that are DEGS in CS vs NS and FS vs NS

## Note: I have decided not to filter out genes that are significantly different between CS and FS because I realized that doesn't make logical sense.

# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", 
                  p.value=0.05, 
                  lfc=0)
# Note that this already has a p-value cutoff of 0.05 unlike my other datasets, but this is ok because this is the least stringent possible cutoff we will use anyway

# Venn diagram of results
vennDiagram(dT)

# Select the genes differentially expressed in both CS_vs_NS and FS_vs_NS
dT_persistent <- dT %>%
  as.data.frame(.) %>%
  filter(CS_vs_NS != 0) %>% # Differentially expressed in CS vs NS
  filter(CS_vs_NS == FS_vs_NS)# Differentially expressed, same sign in FS vs NS
nrow(dT_persistent) # 128 genes indeed
## [1] 87
# Get the toptable format for all genes
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) # Inf shows all the significant genes

# Filter to the "persistent" genes
tT_persistent <- tT %>%
  filter(ID %in% rownames(dT_persistent))

# Filter out blanks, keep lower FDR of ties
tT_persistent <- tT_persistent %>%
  filter(Gene.symbol != "") %>% # Remove blank gene symbols
  group_by(Gene.symbol) %>%
  slice_min(adj.P.Val, with_ties = TRUE) %>% 
  # For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
  ungroup()
nrow(tT_persistent)
## [1] 77
# Checking for ties
ties <- tT_persistent%>%
  group_by(Gene.symbol) %>%
  filter(n() > 1) %>%
  ungroup()
print(ties)
## # A tibble: 0 × 28
## # ℹ 28 variables: ID <chr>, Gene.title <chr>, Gene.symbol <chr>, Gene.ID <chr>,
## #   UniGene.title <chr>, UniGene.symbol <chr>, UniGene.ID <chr>,
## #   Nucleotide.Title <chr>, GI <int>, GenBank.Accession <chr>,
## #   Platform_CLONEID <lgl>, Platform_ORF <lgl>, Platform_SPOTID <chr>,
## #   Chromosome.location <chr>, Chromosome.annotation <chr>, GO.Function <chr>,
## #   GO.Process <chr>, GO.Component <chr>, GO.Function.ID <chr>,
## #   GO.Process.ID <chr>, GO.Component.ID <chr>, CS_vs_NS <dbl>, …
# As there is a tie with MUCA5 I will remove the MUCA5 probe with an "x" label for cross-reactivity
tT_persistent <- tT_persistent %>% filter (ID != "214303_x_at")

#Pick the columns we care about
GSE7895_persistent_DEGs <- tT_persistent %>%
  dplyr::select(., Gene.symbol, CS_vs_NS, FS_vs_NS, CS_vs_FS, adj.P.Val) %>%
  dplyr::rename(., Gene = Gene.symbol, CS_NS_A2 = CS_vs_NS, FS_NS_A2 = FS_vs_NS, CS_FS_A2 = CS_vs_FS, FDR_A2 = adj.P.Val)

# Save output
#write.table(GSE7895_persistent_DEGs, "../2_Outputs/1_Airway_DEGs/GSE7895_persistent_DEGs_no_quantile_20241216.txt")

4.5 Extra check (PCA for stratification of samples based on persistent genes)

## Filter exprs to the "persistent" genes
exprs_persistent <- as.data.frame(exprs(gset)) %>%
  filter(rownames(.) %in% tT_persistent$ID)

## Plot PCA ##
colz<- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs_persistent,
        gene.selection = "common",
        main = "PCA for GSE7895 with persistent genes",
        col = colz,
        pch = 1
        #labels = gs
        )

legend("topright", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

# You can see more separation happening, but I would expect to see current and former smokers more mixed together, whereas we see former and never smokers more mixed together. Hmm okay, interesting at least.

# Might be good to check on the age, packyears and TSQ here as well?


## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)]  # Map ages to gradient colors
plotMDS(exprs_persistent,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher age)",
        col = colz_age,
        pch = 16
        )
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age), 
       fill = palette(2), 
       title = "Age")

# Does not seem to be an age effect

### Plot PCA of packyears ###

# Excluding packyears of zero (never smokers)
exprs_persistent_packyears <- as.data.frame(exprs_persistent) %>%
  dplyr::select(rownames(pheno_packyears))

colz_packyears <-  palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_persistent_packyears,
        gene.selection = "common",
        main = "PCA for GSE7895 persistent genes (darker blue ~ higher packyears)",
        col = colz_packyears,
        pch = 16
        #labels = gs
        )
# Add a color bar for packyears
legend("bottomleft", legend = range(pheno_packyears$packyears), 
       fill = palette(2), 
       title = "Packyears")

## Maybe some sort of packyears effect happening, not obviously so


### Plot PCA of time since quitting ###
exprs_persistent_tsq <- as.data.frame(exprs_persistent) %>%
  dplyr::select(rownames(pheno_tsq))

colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)]  # Map packyears to gradient colors
plotMDS(exprs_persistent_tsq ,
        gene.selection = "common",
        main = "PCA for GSE7895 persistent genes (darker blue ~ more months since quitting)",
        col = colz_TSQ,
        pch = 16
        #labels = gs
        )
legend("bottomleft", legend = range(pheno_tsq$TSQ), 
       fill = palette(2), 
       title = "TSQ")

## Maybe some TSQ effect but not super obvious